1 Executive Summary

  1. NBA player Salary based on their statistics
  2. Use glm model
  3. use neural network
  4. xgboosting

2 Introduction

3 Loading and Exploring Data

3.1 Loading libraries required

library(knitr)
library(plyr)
library(dplyr)
library(tidyr)
library(caret)
library(ggplot2)
library(corrplot)
library(stringr)
library(scales)
library(randomForest)
library(psych)
library(glmnet)
library(rpart)
library(lubridate)
library(plotly)
library(forcats)
library(ggExtra)
opts_chunk$set(echo = TRUE, cache = TRUE)
opts_chunk$set(tidy.opts = list(width.cutoff = 60), tidy = TRUE, fig.height = 6, fig.width = 9)

3.2 Loading data files

pgstats <- read.csv("files/2022/player_PGstats_2021.csv")
adstats <- read.csv("files/2022/player_Adstats_2021.csv")
salary <- read.csv("files/2022/salary2022.csv")

3.3 File description

3.3.1 player_PGstats_2021.csv

NBA players statistics per game in 2021-2022 season
source: https://www.basketball-reference.com/leagues/NBA_2022_per_game.html

dim(pgstats)
## [1] 812  31
str(pgstats)
## 'data.frame':    812 obs. of  31 variables:
##  $ Rk       : int  1 2 3 4 5 6 6 6 7 8 ...
##  $ Player   : chr  "Precious Achiuwa" "Steven Adams" "Bam Adebayo" "Santi Aldama" ...
##  $ Pos      : chr  "C" "C" "C" "PF" ...
##  $ Age      : int  22 28 24 21 36 23 23 23 26 23 ...
##  $ Tm       : chr  "TOR" "MEM" "MIA" "MEM" ...
##  $ G        : int  73 76 56 32 47 65 50 15 66 56 ...
##  $ GS       : int  28 75 56 0 12 21 19 2 61 56 ...
##  $ MP       : num  23.6 26.3 32.6 11.3 22.3 22.6 26.3 9.9 27.3 32.3 ...
##  $ FG       : num  3.6 2.8 7.3 1.7 5.4 3.9 4.7 1.1 3.9 6.6 ...
##  $ FGA      : num  8.3 5.1 13 4.1 9.7 10.5 12.6 3.2 8.6 9.7 ...
##  $ FG.      : num  0.439 0.547 0.557 0.402 0.55 0.372 0.375 0.333 0.448 0.677 ...
##  $ X3P      : num  0.8 0 0 0.2 0.3 1.6 1.9 0.7 2.4 0 ...
##  $ X3PA     : num  2.1 0 0.1 1.5 1 5.2 6.1 2.2 5.9 0.2 ...
##  $ X3P.     : num  0.359 0 0 0.125 0.304 0.311 0.311 0.303 0.409 0.1 ...
##  $ X2P      : num  2.9 2.8 7.3 1.5 5.1 2.3 2.8 0.4 1.5 6.6 ...
##  $ X2PA     : num  6.1 5 12.9 2.6 8.8 5.3 6.5 1 2.7 9.6 ...
##  $ X2P.     : num  0.468 0.548 0.562 0.56 0.578 0.433 0.434 0.4 0.533 0.688 ...
##  $ eFG.     : num  0.486 0.547 0.557 0.424 0.566 0.449 0.45 0.438 0.588 0.678 ...
##  $ FT       : num  1.1 1.4 4.6 0.6 1.9 1.2 1.4 0.7 1 2.9 ...
##  $ FTA      : num  1.8 2.6 6.1 1 2.2 1.7 1.9 0.8 1.1 4.2 ...
##  $ FT.      : num  0.595 0.543 0.753 0.625 0.873 0.743 0.722 0.917 0.865 0.708 ...
##  $ ORB      : num  2 4.6 2.4 1 1.6 0.6 0.7 0.1 0.5 3.4 ...
##  $ DRB      : num  4.5 5.4 7.6 1.7 3.9 2.3 2.6 1.5 2.9 7.3 ...
##  $ TRB      : num  6.5 10 10.1 2.7 5.5 2.9 3.3 1.5 3.4 10.8 ...
##  $ AST      : num  1.1 3.4 3.4 0.7 0.9 2.4 2.8 1.1 1.5 1.6 ...
##  $ STL      : num  0.5 0.9 1.4 0.2 0.3 0.7 0.8 0.3 0.7 0.8 ...
##  $ BLK      : num  0.6 0.8 0.8 0.3 1 0.4 0.4 0.3 0.3 1.3 ...
##  $ TOV      : num  1.2 1.5 2.6 0.5 0.9 1.4 1.7 0.5 0.7 1.7 ...
##  $ PF       : num  2.1 2 3.1 1.1 1.7 1.6 1.8 1 1.5 1.7 ...
##  $ PTS      : num  9.1 6.9 19.1 4.1 12.9 10.6 12.8 3.5 11.1 16.1 ...
##  $ player_id: chr  "achiupr01" "adamsst01" "adebaba01" "aldamsa01" ...

3.3.2 player_Adstats_2021.csv

player_Adstats_2021.csv – NBA players advance statistics in 2021-2022 season source: https://www.basketball-reference.com/leagues/NBA_2022_advanced.html

dim(adstats)
## [1] 812  30
str(adstats)
## 'data.frame':    812 obs. of  30 variables:
##  $ Rk       : int  1 2 3 4 5 6 6 6 7 8 ...
##  $ Player   : chr  "Precious Achiuwa" "Steven Adams" "Bam Adebayo" "Santi Aldama" ...
##  $ Pos      : chr  "C" "C" "C" "PF" ...
##  $ Age      : int  22 28 24 21 36 23 23 23 26 23 ...
##  $ Tm       : chr  "TOR" "MEM" "MIA" "MEM" ...
##  $ G        : int  73 76 56 32 47 65 50 15 66 56 ...
##  $ MP       : int  1725 1999 1825 360 1050 1466 1317 149 1805 1809 ...
##  $ PER      : num  12.7 17.6 21.8 10.2 19.6 10.5 10.5 10.2 12.7 23 ...
##  $ TS.      : num  0.503 0.56 0.608 0.452 0.604 0.475 0.474 0.497 0.609 0.698 ...
##  $ X3PAr    : num  0.259 0.003 0.008 0.364 0.1 0.497 0.483 0.688 0.684 0.018 ...
##  $ FTr      : num  0.217 0.518 0.466 0.242 0.223 0.16 0.153 0.25 0.13 0.428 ...
##  $ ORB.     : num  8.7 17.9 8.7 9.4 7.8 2.7 3 0.8 1.9 12 ...
##  $ DRB.     : num  21.7 22 26.1 16.1 18.7 11.5 11 15.6 10.9 24.5 ...
##  $ TRB.     : num  14.9 19.9 17.5 12.6 13.4 7.1 6.9 8.5 6.5 18.4 ...
##  $ AST.     : num  6.9 16.1 17.5 7.7 6.3 16.1 16.1 15.5 7.6 8.2 ...
##  $ STL.     : num  1.1 1.6 2.2 0.8 0.6 1.5 1.5 1.7 1.2 1.2 ...
##  $ BLK.     : num  2.3 2.7 2.6 2.5 4 1.5 1.4 2.4 1 3.7 ...
##  $ TOV.     : num  11.3 19.6 14.4 9.9 8 11.3 11.2 13.1 6.7 12.7 ...
##  $ USG.     : num  18.5 12 25 18.4 22.4 24.1 24.8 17.9 15.2 18.1 ...
##  $ X        : logi  NA NA NA NA NA NA ...
##  $ OWS      : num  0.4 3.8 3.6 -0.1 2.1 -1.1 -1.1 0 2.8 5.4 ...
##  $ DWS      : num  2.1 3 3.5 0.4 1 1.1 0.9 0.2 1.4 3 ...
##  $ WS       : num  2.5 6.8 7.2 0.3 3.1 0.1 -0.1 0.2 4.2 8.5 ...
##  $ WS.48    : num  0.07 0.163 0.188 0.044 0.141 0.003 -0.005 0.07 0.11 0.225 ...
##  $ X.1      : logi  NA NA NA NA NA NA ...
##  $ OBPM     : num  -2 1 1.7 -4.2 1.3 -1.8 -1.7 -2.9 0.6 2.7 ...
##  $ DBPM     : num  -0.6 1 2.1 -1.5 -0.6 -1.1 -1.3 1.2 -0.2 1.2 ...
##  $ BPM      : num  -2.6 2 3.8 -5.7 0.7 -2.9 -3 -1.7 0.4 3.9 ...
##  $ VORP     : num  -0.2 2 2.7 -0.3 0.7 -0.3 -0.3 0 1.1 2.7 ...
##  $ player_id: chr  "achiupr01" "adamsst01" "adebaba01" "aldamsa01" ...

3.3.3 salary2022.csv

salary2022.csv – NBA players contract in 2022-2023 season onward
source: https://www.basketball-reference.com/contracts/players.html

dim(salary)
## [1] 448  12
str(salary)
## 'data.frame':    448 obs. of  12 variables:
##  $ Rk          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Player      : chr  "Stephen Curry" "Russell Westbrook" "LeBron James" "Kevin Durant" ...
##  $ Tm          : chr  "GSW" "LAL" "LAL" "BRK" ...
##  $ X2022.23    : chr  "$48070014" "$47063478" "$44474988" "$44119845" ...
##  $ X2023.24    : chr  "$51915615" "" "" "$46407433" ...
##  $ X2024.25    : chr  "$55761216" "" "" "$49856021" ...
##  $ X2025.26    : chr  "$59606817" "" "" "$53282609" ...
##  $ X2026.27    : chr  "" "" "" "" ...
##  $ X2027.28    : chr  "" "" "" "" ...
##  $ Signed.Using: chr  "Bird" "Bird Rights" "Bird" "Bird" ...
##  $ Guaranteed  : chr  "$215353662" "$47063478" "$44474988" "$193665908" ...
##  $ player_id   : chr  "curryst01" "westbru01" "jamesle01" "duranke01" ...

4 Preprocessing Data

4.1 Merge data tables

I will merge the tables by their primary key (pgstats.player_id) and foreign key (salary.player_id) by inner join (only take the entries which exist). I will treat the players that received salary but have not played any game as outliers.

merged <- merge(pgstats, adstats, by = c("player_id", "Tm"))

Since there is players that has changed team in the middle of the season, I will merge the stats together by taking the weighted mean of per game stats and sum of cumulative stats. I will find the variable where both are always the same.

pgName <- names(pgstats)
same <- c()
for (i in 1:length(pgName)) {
    if (!pgName[i] %in% names(adstats)) {
        same[i] <- FALSE
        next
    }
    if (all(pgstats[, pgName[i]] == adstats[, pgName[i]])) {
        same[i] <- TRUE
    } else {
        same[i] <- FALSE
    }
}
pgName[same]
## [1] "Rk"        "Player"    "Pos"       "Age"       "Tm"        "G"        
## [7] "player_id"

For the team variable, I will take the last team it was in, as it should have the largest effect

merged <- merged %>%
    group_by(player_id) %>%
    summarise(Tm = Tm[length(Tm)], Rk = Rk.x[1], Player = Player.x[1],
        Position = Pos.x[1], Age = Age.x[1], Game_played = sum(G.x),
        Game_started = sum(GS), MP = sum(MP.y)/sum(G.x), FG = weighted.mean(FG,
            G.x), FGA = weighted.mean(FGA, G.x), FGpct = weighted.mean(FG.,
            G.x), X3P = weighted.mean(X3P, G.x), X3PA = weighted.mean(X3PA,
            G.x), X3Ppct = weighted.mean(X3P., G.x), X2P = weighted.mean(X2P,
            G.x), X2PA = weighted.mean(X2PA, G.x), X2Ppct = weighted.mean(X2P.,
            G.x), eFGpct = weighted.mean(eFG., G.x), FT = weighted.mean(FT,
            G.x), FTA = weighted.mean(FTA, G.x), FTpct = weighted.mean(FT.,
            G.x), ORB = weighted.mean(ORB, G.x), DRB = weighted.mean(DRB,
            G.x), AST = weighted.mean(AST, G.x), STL = weighted.mean(STL,
            G.x), BLK = weighted.mean(BLK, G.x), TOV = weighted.mean(TOV,
            G.x), PF = weighted.mean(PF, G.x), PTS = weighted.mean(PTS,
            G.x), PER = weighted.mean(PER, MP.x), TSpct = weighted.mean(TS.,
            G.x), X3PAr = weighted.mean(X3PAr, G.x), FTr = weighted.mean(FTr,
            G.x), ORBpct = weighted.mean(ORB., G.x), DRBpct = weighted.mean(DRB.,
            G.x), TRBpct = weighted.mean(TRB., G.x), ASTpct = weighted.mean(AST.,
            G.x), STLpct = weighted.mean(STL., G.x), BLKpct = weighted.mean(BLK.,
            G.x), TOVpct = weighted.mean(TOV., G.x), USGpct = weighted.mean(USG.,
            G.x), OWS = sum(OWS), DWS = sum(DWS), WS = sum(WS),
        WSper48 = weighted.mean(WS.48, MP.x), OBPM = weighted.mean(OBPM,
            G.x), DBPM = weighted.mean(DBPM, G.x), BPM = weighted.mean(BPM,
            G.x), VORP = mean(VORP))
merged <- merge(merged, salary, by = c("player_id"))

4.2 Save table

write.csv(merged, "dataset/all2022.csv")

4.2.1 Read in the saved table

all <- read.csv("dataset/all2022.csv")
dim(all)
## [1] 390  62

There are 390 entries.

str(all)
## 'data.frame':    390 obs. of  62 variables:
##  $ X           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ player_id   : chr  "achiupr01" "adamsst01" "adebaba01" "aldamsa01" ...
##  $ Tm.x        : chr  "TOR" "MEM" "MIA" "MEM" ...
##  $ Rk.x        : int  1 2 3 4 6 7 8 9 11 12 ...
##  $ Player.x    : chr  "Precious Achiuwa" "Steven Adams" "Bam Adebayo" "Santi Aldama" ...
##  $ Position    : chr  "C" "C" "C" "PF" ...
##  $ Age         : int  22 28 24 21 23 26 23 23 28 27 ...
##  $ Game_played : int  73 76 56 32 130 66 56 54 69 67 ...
##  $ Game_started: int  28 75 56 0 42 61 56 1 11 67 ...
##  $ MP          : num  23.6 26.3 32.6 11.2 22.6 ...
##  $ FG          : num  3.6 2.8 7.3 1.7 3.88 ...
##  $ FGA         : num  8.3 5.1 13 4.1 10.5 ...
##  $ FGpct       : num  0.439 0.547 0.557 0.402 0.369 ...
##  $ X3P         : num  0.8 0 0 0.2 1.61 ...
##  $ X3PA        : num  2.1 0 0.1 1.5 5.2 5.9 0.2 2 1.6 3.6 ...
##  $ X3Ppct      : num  0.359 0 0 0.125 0.31 ...
##  $ X2P         : num  2.9 2.8 7.3 1.5 2.27 ...
##  $ X2PA        : num  6.1 5 12.9 2.6 5.27 ...
##  $ X2Ppct      : num  0.468 0.548 0.562 0.56 0.43 ...
##  $ eFGpct      : num  0.486 0.547 0.557 0.424 0.448 ...
##  $ FT          : num  1.1 1.4 4.6 0.6 1.22 ...
##  $ FTA         : num  1.8 2.6 6.1 1 1.67 ...
##  $ FTpct       : num  0.595 0.543 0.753 0.625 0.755 0.865 0.708 0.679 0.638 0.722 ...
##  $ ORB         : num  2 4.6 2.4 1 0.581 ...
##  $ DRB         : num  4.5 5.4 7.6 1.7 2.32 ...
##  $ AST         : num  1.1 3.4 3.4 0.7 2.4 ...
##  $ STL         : num  0.5 0.9 1.4 0.2 0.692 ...
##  $ BLK         : num  0.6 0.8 0.8 0.3 0.388 ...
##  $ TOV         : num  1.2 1.5 2.6 0.5 1.41 ...
##  $ PF          : num  2.1 2 3.1 1.1 1.61 ...
##  $ PTS         : num  9.1 6.9 19.1 4.1 10.6 ...
##  $ PER         : num  12.7 17.6 21.8 10.2 10.4 ...
##  $ TSpct       : num  0.503 0.56 0.608 0.452 0.477 ...
##  $ X3PAr       : num  0.259 0.003 0.008 0.364 0.514 ...
##  $ FTr         : num  0.217 0.518 0.466 0.242 0.168 ...
##  $ ORBpct      : num  8.7 17.9 8.7 9.4 2.6 ...
##  $ DRBpct      : num  21.7 22 26.1 16.1 11.8 ...
##  $ TRBpct      : num  14.9 19.9 17.5 12.6 7.18 ...
##  $ ASTpct      : num  6.9 16.1 17.5 7.7 16 ...
##  $ STLpct      : num  1.1 1.6 2.2 0.8 1.52 ...
##  $ BLKpct      : num  2.3 2.7 2.6 2.5 1.57 ...
##  $ TOVpct      : num  11.3 19.6 14.4 9.9 11.5 ...
##  $ USGpct      : num  18.5 12 25 18.4 23.7 ...
##  $ OWS         : num  0.4 3.8 3.6 -0.1 -2.2 2.8 5.4 1 1 9.2 ...
##  $ DWS         : num  2.1 3 3.5 0.4 2.2 1.4 3 1.1 2.5 3.7 ...
##  $ WS          : num  2.5 6.8 7.2 0.3 0.2 4.2 8.5 2.1 3.5 12.9 ...
##  $ WSper48     : num  0.07 0.163 0.188 0.044 0.0107 ...
##  $ OBPM        : num  -2 1 1.7 -4.2 -1.89 ...
##  $ DBPM        : num  -0.6 1 2.1 -1.5 -0.912 ...
##  $ BPM         : num  -2.6 2 3.8 -5.7 -2.8 0.4 3.9 1.9 1.2 11.2 ...
##  $ VORP        : num  -0.2 2 2.7 -0.3 -0.2 1.1 2.7 0.8 1.2 7.4 ...
##  $ Rk.y        : int  301 76 33 349 226 165 62 418 159 6 ...
##  $ Player.y    : chr  "Precious Achiuwa" "Steven Adams" "Bam Adebayo" "Santi Aldama" ...
##  $ Tm.y        : chr  "TOR" "MEM" "MIA" "MEM" ...
##  $ X2022.23    : chr  "$2840160" "$17926829" "$30351834" "$2094240" ...
##  $ X2023.24    : chr  "$4379527" "" "$32600118" "$2194200" ...
##  $ X2024.25    : chr  "" "" "$34848402" "$3960531" ...
##  $ X2025.26    : chr  "" "" "$37096686" "" ...
##  $ X2026.27    : chr  "" "" "" "" ...
##  $ X2027.28    : chr  "" "" "" "" ...
##  $ Signed.Using: chr  "1st Round Pick" "1st Round Pick" "Bird" "1st Round Pick" ...
##  $ Guaranteed  : chr  "$2840160" "$17926829" "$134897040" "$2094240" ...

4.3 Remove repeated variables

4.3.1 Player Name

sum(all$Player.x != all$Player.y)
## [1] 0

Since there is no difference in the player name, I will remove player.y and renaming player.x to name

all <- all %>%
    select(!Player.y) %>%
    rename(name = Player.x)

4.3.2 Rank

It is rank in their original respective table (alphabetical order of player name in player statistics tables and salary in 2022-23 season for salary table).
Since, it doesn’t carry any extra information, I will remove both of the variables.

all <- all %>%
    select(!c(Rk.x, Rk.y))

4.3.3 Team

Tm.x is the team the player in in 2021-22 season while Tm.y is the team of 2022-23 season. I will change Tm.x to team_2021 and Tm.y to team_2022.

all <- all %>%
    rename(team_2021 = Tm.x, team_2022 = Tm.y)

5 Exploratory analysis

5.1 The response variable: salary

The aim of this project is to predict the salary next year. I will remove the salary of 2023-24 season onward and change the X2022.23 to numeric variables.

all <- all %>%
    select(!c(X2023.24, X2024.25, X2025.26, X2026.27, X2027.28)) %>%
    rename(salary = X2022.23) %>%
    mutate(salary = as.numeric(str_extract(salary, "[0-9]+")))
plot_ly(data = all, x = ~salary, type = "histogram", nbinsx = 30) %>%
    layout(title = "Frequency Diagram of NBA salary in 2022-23 season",
        xaxis = list(title = "yearly salary (USD)"), yaxis = list(title = "frequency"))
## Warning: Ignoring 1 observations

The salary is highly right skewed and there is high frequency concentrated on 0 to 2 million range. This might be because of the existence of minimum salary in NBA, which is 1 million to 3 million per year depending on their experience (Adams, 2022). I will keep that in mind.

summary(all$salary)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##   333333  2201400  5954454 10155507 14004703 48070014        1

5.2 Important Numeric Variables

I will first use the correlation with salary to get a feel on the numeric variables on the response variables

5.2.1 Correlation with salary 2022-23

numVar <- which(sapply(all, is.numeric))
numVarNames <- names(numVar)
length(numVarNames)
## [1] 47

There are 47 numeric variables

all_numVar <- all[, numVar]
all_numVar <- select(all_numVar, !X)

cor_Mat <- cor(all_numVar, use = "pairwise.complete.obs")

cor_names <- names(sort(cor_Mat[, "salary"], decreasing = TRUE))[1:20]

cor_Mat <- cor_Mat[cor_names, cor_names]

corrplot.mixed(cor_Mat, tl.pos = "lt")

5.2.2 Points

Pts:

Points per game

It has the highest correlation with salary among the numeric variables (0.7908116). It is the average point per game played.

ggplotly(ggplot(all %>%
    drop_na(PTS, salary), aes(x = PTS, y = salary)) + geom_point(col = "blue") +
    geom_smooth(formula = y ~ x, method = "loess") + labs(title = "points per game in NBA 2021-22 vs salary in NBA 2022-23",
    x = "points per game", y = "yearly salary (USD)"))

There is a clear linear correlation between salary and points per game. The correlation is smaller when the points per game is below about 9 but increase after it goes above 9 points.

ggplotly(ggplot(all %>%
    drop_na(PTS, salary), aes(x = PTS)) + geom_histogram(bins = 30) +
    labs(title = "Frequency Distribution of points per game in NBA 2021-22",
        x = "points per game", y = "Frequency"))

5.2.3 Value Over Replacement player

Value Over Replacement Player:

VORP - Value Over Replacement Player (available since the 1973-74 season in the NBA); a box score estimate of the points per 100 TEAM possessions that a player contributed above a replacement-level (-2.0) player, translated to an average team and prorated to an 82-game season.

Although FG, FGA, FT etc are more highly correlated, they are also highly correlated to points per game (> 0.75). I will look at the next one that is not highly correlated to points per game. It has a correlation of (0.6619577) with salary.

ggplotly(ggplot(all %>%
    drop_na(VORP, salary), aes(x = VORP, y = salary)) + geom_point(col = "blue") +
    geom_smooth(formula = y ~ x, method = "loess") + geom_smooth(formula = y ~
    x, method = "glm", linetype = "dotted", col = "red", se = FALSE) +
    labs(title = "Value over replacement player in NBA 2021-22 vs salary in NBA 2022-23",
        x = "VORP", y = "yearly salary (USD)"))

It shows clear linear correlation except some in both extreme of the VORP.

5.2.4 Assists

Assists:

Assists per game

It has a high correlation with salary while not a having such a high correlation with points per game. It has a correlation of (0.6114318) with salary.

ggplotly(ggplot(all %>%
    drop_na(AST, salary), aes(x = AST, y = salary)) + geom_point(col = "blue") +
    geom_smooth(formula = y ~ x, method = "loess") + geom_smooth(formula = y ~
    x, method = "glm", linetype = "dotted", col = "red", se = FALSE) +
    labs(title = "Assists per game in NBA 2021-22 vs salary in NBA 2022-23",
        x = "Assists per game", y = "yearly salary (USD)"))

It show positive correlation until it goes above 6 assists per game where it shows negative correlation. This maybe explained by that the players with high assist are usually not the first attacking choice of the team which might explain by they are pay less.

6 Imputing missing data and factorising character variables.

6.1 Impute missing data

Nacol <- names(which(colSums(is.na(all) | all == "") > 0))
sort(colSums(sapply(all[Nacol], function(x) is.na(x) | x == "")),
    decreasing = TRUE)
## Signed.Using   Guaranteed       X3Ppct        FTpct       X2Ppct        FGpct 
##           51           37           18            9            6            2 
##       eFGpct        TSpct        X3PAr          FTr       TOVpct       salary 
##            2            2            2            2            2            1

6.1.1 Salary

kable(all[is.na(all$salary), c("X", "name", "salary")])
X name salary
366 366 Ish Wainright NA

The salary of Ish Wainright is 125000 USD spotrac (n.d.).

all$salary[all$X == 366] <- 125000

6.1.2 Signed.Using

Signed.Using:

The type of contract use to sign

I will impute by changing all NA to “None”.

unique(all$Signed.Using)
##  [1] "1st Round Pick"      "Bird"                "MLE"                
##  [4] "Minimum Salary"      ""                    "Sign and Trade"     
##  [7] "Bird Rights"         "Early Bird"          "Cap Space"          
## [10] "1st round pick"      "Mini MLE"            "1st Round pick"     
## [13] "Bi-Annual Exception" "Non Bird"            "Cap space"          
## [16] "Room Exception"
all$Signed.Using[grep("^1st [Rr]ound [Pp]ick", all$Signed.Using)] <- "1st round pick"
all$Signed.Using[grep("Cap [Ss]pace", all$Signed.Using)] <- "Cap space"
all$Signed.Using[is.na(all$Signed.Using) | all$Signed.Using ==
    ""] <- "None"

ggplotly(ggplot(all, aes(x = fct_reorder(as.factor(Signed.Using),
    salary, .fun = "mean"), y = salary, fill = Signed.Using)) +
    geom_boxplot() + geom_point(stat = "summary", fun = "mean") +
    labs(title = "Type of contract vs salary", x = "Type of contract",
        y = "yearly salary (USD)") + theme(axis.text.x = element_text(angle = 45,
    hjust = 1)))

6.1.3 Guaranteed

Guaranteed:

The amount of a player's remaining salary that is guarenteed.

Since it is a direct indication of the salary, I will remove this variable.

all <- select(all, !Guaranteed)

6.1.4 X3Ppct

X3Ppct:

3 point field goal percentage
kable(all[which(is.na(all$X3Ppct)), c("X3P", "X3PA", "X3Ppct")])
X3P X3PA X3Ppct
16 0.0000000 0.0000000 NA
39 0.0000000 0.0000000 NA
49 0.0000000 0.0000000 NA
58 0.0000000 0.0000000 NA
72 0.0000000 0.0000000 NA
81 0.0000000 0.0000000 NA
121 0.2942308 1.0807692 NA
195 0.0000000 0.0000000 NA
196 0.0000000 0.0000000 NA
204 0.0000000 0.3033333 NA
257 0.5000000 1.0000000 NA
277 0.0000000 0.0000000 NA
310 0.0000000 0.0000000 NA
313 0.0000000 0.0000000 NA
327 0.0000000 0.0000000 NA
331 0.0000000 0.0000000 NA
341 0.0000000 0.0000000 NA
390 0.0000000 0.0000000 NA

I will impute by setting to 0 if there is no 3 point attempt.

all$X3Ppct[which(is.na(all$X3Ppct))] <- sapply(which(is.na(all$X3Ppct)),
    function(x) ifelse(all$X3PA[x] == 0, 0, all$X3P[x]/all$X3PA[x]))
ggplotly(ggplot(all, aes(x = X3Ppct, y = salary)) + geom_point() +
    geom_smooth(col = "red", formula = y ~ x, method = "glm") +
    labs(title = "3 point percentage vs salary", x = "3 point field goal percentage",
        y = "yearly salary (USD)"))

It shows a slight but not significant correlation between salary and 3 point percentage.

ggplotly(ggplot(all, aes(x = X3Ppct, y = salary, col = Position)) +
    geom_point() + geom_smooth(formula = y ~ x, method = "glm") +
    facet_grid(Position ~ .) + labs(title = "3 point percentage vs salary",
    x = "3 point field goal percentage", y = "yearly salary (USD)"))

6.1.5 FTpct

FTpct:

Free throw percentage
kable(all[which(is.na(all$FTpct)), c("FT", "FTA", "FTpct")])
FT FTA FTpct
115 0.0000000 0.0000000 NA
121 0.9730769 1.6115385 NA
154 0.0000000 0.0000000 NA
169 1.0369048 1.4047619 NA
204 0.2966667 0.4200000 NA
222 0.2166667 0.2166667 NA
257 0.0000000 0.0000000 NA
331 0.0000000 0.0000000 NA
341 0.0000000 0.0000000 NA

I will impute by setting to 0 if there is no free throw attempt.

all$FTpct[which(is.na(all$FTpct))] <- sapply(which(is.na(all$FTpct)),
    function(x) ifelse(all$FTA[x] == 0, 0, all$FT[x]/all$FTA[x]))
ggplotly(ggplot(all, aes(x = FTpct, y = salary)) + geom_point() +
    geom_smooth(formula = y ~ x, method = "glm") + labs(title = "Free throw percentage vs salary",
    x = "free throw percentage", y = "yearly salary (USD)"))

6.1.6 X2pct

2-Point Field Goal Percentage

kable(all[is.na(all$X2Ppct), c("name", "X2P", "X2PA", "X2Ppct")])
name X2P X2PA X2Ppct
121 Wenyen Gabriel 1.7980769 3.175000 NA
169 Danuel House Jr. 0.8595238 1.902381 NA
222 Didi Louzada 0.3055556 1.094444 NA
257 Juwan Morgan 0.5000000 0.500000 NA
332 Nik Stauskas 0.1125000 0.500000 NA
355 Rayjon Tucker 0.4000000 0.800000 NA
all$X2Ppct[is.na(all$X2Ppct)] <- all$X2P[is.na(all$X2Ppct)]/all$X2PA[is.na(all$X2Ppct)]

6.1.7 FGpct

Field Goal Percentage

kable(all[is.na(all$FGpct), c("FG", "FGA", "FGpct")])
FG FGA FGpct
121 2.092308 4.217308 NA
257 1.000000 1.500000 NA
all$FGpct[is.na(all$FGpct)] <- all$FG[is.na(all$FGpct)]/all$FGA[is.na(all$FGpct)]

6.1.8 eFGpct

Effective Field Goal Percentage:

This statistics adjusts for the fact that 3-point field goal is worth one more point than 2-point field goal.

The formula for the effective field goal percentage is (Basketball_Reference, n.d.):
\[ \frac{\text{2-Point Field Goals} + 1.5 \cdot \text{3-Point Field Goals}}{\text{Total Field Goal Attempts}} \]

kable(all[which(is.na(all$eFGpct)), c("X2P", "X3P", "FGA", "eFGpct")])
X2P X3P FGA eFGpct
121 1.798077 0.2942308 4.217308 NA
257 0.500000 0.5000000 1.500000 NA
all$eFGpct[is.na(all$eFGpct)] <- (all$X2P[is.na(all$eFGpct)] +
    1.5 * all$X3P[is.na(all$eFGpct)])/all$FGA[is.na(all$eFGpct)]
ggplotly(ggplot(all, aes(x = eFGpct, y = salary)) + geom_point() +
    geom_smooth(formula = y ~ x, method = "glm") + labs(title = "Effect field goal percentage vs salary",
    x = "effective field goal percentage", y = "yearly salary (USD)"))

6.1.9 TSpct

True Shooting percentage:

True shooting percentage is a measure of shooting efficiency that takes into account field goals, 3-point field goals, and free throws.

Formula (Basketball_Reference, n.d.): \[ \frac{\text{Points}}{2\cdot(\text{Field Goal Attempts}+0.44\cdot\text{Free Throw Attempts})} \]

kable(all[is.na(all$TSpct), c("PTS", "FGA", "FTA")])
PTS FGA FTA
121 5.413462 4.217308 1.611538
257 2.500000 1.500000 0.000000
all$TSpct[is.na(all$TSpct)] <- all$PTS[is.na(all$TSpct)]/(2 *
    all$FGA[is.na(all$TSpct)] + 2 * 0.44 * all$FTA[is.na(all$TSpct)])
ggplotly(ggplot(all, aes(x = TSpct, y = salary)) + geom_point() +
    geom_smooth(formula = y ~ x, method = "glm") + labs(title = "True shooting percentage vs salary",
    x = "True shooting percentage", y = "yearly salary (USD)"))

6.1.10 X3PAr

3-Point Attempt rate

Percentage of field goal attempt from 3-point range
kable(all[is.na(all$X3PAr), c("X3PA", "FGA", "X3PAr")])
X3PA FGA X3PAr
121 1.080769 4.217308 NA
257 1.000000 1.500000 NA
all$X3PAr[is.na(all$X3PAr)] <- all$X3PA[is.na(all$X3PAr)]/all$FGA[is.na(all$X3PAr)]

6.1.11 FTr

Free Throw Attempt Rate:

Number of free throw attempts per field goal attempt
kable(all[is.na(all$FTr), c("FTA", "FGA", "FTr")])
FTA FGA FTr
121 1.611538 4.217308 NA
257 0.000000 1.500000 NA
all$FTr[is.na(all$FTr)] <- all$FTA[is.na(all$FTr)]/all$FGA[is.na(all$FTr)]

6.1.12 TOVpct

Turnover percentage:

An estimate of turnovers committed per 100 plays
kable(all[is.na(all$TOVpct), c("X", "name", "TOV", "TOVpct")])
X name TOV TOVpct
121 121 Wenyen Gabriel 0.6134615 NA
257 257 Juwan Morgan 0.0000000 NA

I will manually impute the data by finding the weighted mean of the turnover percentage by minute played from the original data set (before merging)

all$TOVpct[c(121, 257)] <- c(11.1, 0)

6.2 Factorizing Character Variables

Find all character variables:

chrVar <- names(which(sapply(all, is.character)))
chrVar
## [1] "player_id"    "team_2021"    "name"         "team_2022"    "Signed.Using"

6.2.1 Player Id and playe name

I will keep player id and name for now to keep track of each entries but will remove it before fitting the model

player_name <- all$name
all <- select(all, !c(name, player_id))

6.2.2 Team

ggplotly(ggplot(all, aes(x = fct_reorder(as.factor(team_2021),
    salary, median, .desc = TRUE), y = salary, fill = reorder(as.factor(team_2021),
    salary, .fun = "mean", decreasing = TRUE))) + geom_boxplot() +
    labs(title = "Salary for each team", x = "team in 2021-22",
        y = "yearly salary in 2022-23 (USD)") + theme(axis.text.x = element_text(angle = 45,
    hjust = 1)) + guides(fill = guide_legend(title = "Team")))
ggplotly(ggplot(all, aes(x = fct_reorder(as.factor(team_2022),
    salary, median, .desc = TRUE), y = salary, fill = reorder(as.factor(team_2022),
    salary, .fun = "mean", decreasing = TRUE))) + geom_boxplot() +
    labs(title = "Salary for each team", x = "team in 2022-23",
        y = "yearly salary in 2022-23 (USD)") + theme(axis.text.x = element_text(angle = 45,
    hjust = 1)) + guides(fill = guide_legend(title = "Team")))
cor(as.numeric(fct_reorder(as.factor(all$team_2021), all$salary,
    median)), all$salary)
## [1] 0.2314913
cor(as.numeric(fct_reorder(as.factor(all$team_2022), all$salary,
    median)), all$salary)
## [1] 0.1691273

There is no clear correlation between salary and team as each team will have varying salary for their star players and bench players. I will remove this variable.

all <- select(all, !c(team_2021, team_2022))

6.2.3 Signed.Using

I will also remove this variable as this might be a direct indication to the salary of the player.

all <- select(all, !Signed.Using)

7 Visualization

7.1 The response variable: salary

Although I have already done some visualisation, I will visualize it again.

summary(all$salary)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   125000  2200350  5921176 10129787 13980271 48070014
ggplotly(ggplot(data = all, aes(x = salary)) + geom_histogram(bins = 50) +
    labs(title = "Frequency distribution of salary", x = "yearly salary (USD)",
        y = "frequency") + theme_minimal())
ggplotly(ggplot(data = all, aes(x = "", y = salary)) + geom_boxplot() +
    labs(y = "salary") + coord_flip() + theme_minimal())
g1 <- ggplot(all, aes(x = Age, y = salary)) + geom_point(alpha = 0.7) +
    theme_minimal()
ggMarginal(g1, type = "boxplot")

7.1.1 ???

inTrain <- sample(0:1, nrow(all), replace = TRUE, prob = c(0.8,
    0.2))
train <- all[inTrain == 0, ]
test <- all[inTrain == 1, ]
mod_rf <- randomForest(salary ~ ., train %>%
    drop_na(), ntree = 500, importance = T)
imp_rf <- importance(mod_rf)
imp_df <- data.frame(Variables = row.names(imp_rf), MSE = imp_rf[,
    1])
imp_df <- imp_df[order(imp_df$MSE, decreasing = TRUE), ]

ggplot(imp_df, aes(x = reorder(Variables, MSE), y = MSE, fill = MSE)) +
    geom_bar(stat = "identity") + labs(x = "Variables", y = "% increase MSE if variable is randomly permuted") +
    coord_flip() + theme(legend.position = "none")

pred <- predict(mod_rf, newdata = test)
mod <- lm(salary ~ ., data = train)
Adams, L., 2022. NBA minimum salaries for 2022/23.
Basketball_Reference, n.d. Glossary.
spotrac, n.d. Ishmail wainright.